Packages needed
library(streamR)
## Loading required package: RCurl
## Loading required package: bitops
## Loading required package: rjson
library(ROAuth)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
Should we get more tweets?
rerun_tweet <- FALSE
# # urls (straight from streamR docs)
# requestURL <- "https://api.twitter.com/oauth/request_token"
# accessURL <- "https://api.twitter.com/oauth/access_token"
# authURL <- "https://api.twitter.com/oauth/authorize"
#
# # I store all my apikeys in one place
# source("~/.apikeys/twitter_keys")
#
# # set up an oauth
# my_oauth <- OAuthFactory$new(
# consumerKey = twitter_consumer_key,
# consumerSecret = twitter_consumer_secret,
# requestURL = requestURL,
# accessURL = accessURL,
# authURL = authURL)
#
# # get the oauth
# my_oauth$handshake(
# cainfo = system.file("CurlSSL", "cacert.pem",
# package = "RCurl")
# )
#
# # save it (and make sure to add this to .gitignore)
# save(my_oauth, file = "keys/my_oauth.Rdata")
Now I will open a connection and for the set timeout time, scrape tweets. The bounding box for Switzerland was pulled from an online repo of country bounding boxes.
# load("keys/my_oauth.Rdata")
#
# filterStream(
# # If the file already exists, tweets will be appended (not overwritten).
# "data/tweetsSwiss.json",
# locations = c(6.02260949059, 45.7769477403, 10.4427014502, 47.8308275417),
# timeout = 600,
# oauth = my_oauth)
df_tweets <- rbind(
parseTweets("data/tweetsSwiss.json", verbose = TRUE),
parseTweets("data/tweetsSwiss_part2.json", verbose = TRUE),
parseTweets("data/tweetsSwiss_part3.json", verbose = TRUE),
parseTweets("data/tweetsSwiss_part4.json", verbose = TRUE),
parseTweets("data/tweetsSwiss_part5.json", verbose = TRUE)
)
## 17280 tweets have been parsed.
## 8146 tweets have been parsed.
## 4852 tweets have been parsed.
## 7999 tweets have been parsed.
## 7095 tweets have been parsed.
df_tweets_ch <- df_tweets %>%
mutate(
language = case_when(
.$lang %in% c(
"en","en-gb","en-GB"
) ~ "English",
.$lang == "de" ~ "Deutsch",
.$lang == "it" ~ "Italian",
.$lang == "fr" ~ "French",
.$lang == "und" ~ "Unknown",
TRUE ~ "Other"
),
user_language = case_when(
.$user_lang %in% c(
"en","en-gb","en-GB"
) ~ "English",
.$user_lang == "de" ~ "Deutsch",
.$user_lang == "it" ~ "Italian",
.$user_lang == "fr" ~ "French",
.$user_lang == "und" ~ "Unknown",
TRUE ~ "Other"
)
) %>%
filter(
country_code == "CH"
)
jb_percent <- function(x){paste0(format(x, big.mark = ",")," (",round(100*x/sum(x)),"%)")}
# Tweets
left_join(
df_tweets_ch %>%
group_by(user_language) %>%
summarise(
Users = n_distinct(screen_name)
) %>%
arrange(-Users) %>%
mutate(`Users` = jb_percent(Users)),
df_tweets_ch %>%
group_by(language) %>%
summarise(
Tweets = n()
) %>%
mutate(Tweets = jb_percent(Tweets)),
by = c("user_language" = "language")
) %>% rename(Language = user_language) %>%
knitr::kable(
caption = "Count of users and tweets (users refers to language the user has enabled in twitter, tweets is based off language of each individual tweet)."
)
| Language | Users | Tweets |
|---|---|---|
| English | 1,806 (41%) | 7,559 (40%) |
| Deutsch | 1,092 (25%) | 2,529 (13%) |
| French | 802 (18%) | 2,767 (15%) |
| Other | 456 (10%) | 3,522 (18%) |
| Italian | 209 (5%) | 473 (2%) |
# Who tweets in different language from website setting
# do median proportio
df_tweets_ch %>%
# remove unknown (usually links)
filter(language != "Unknown") %>%
mutate(
wronglang = ifelse(user_language != language,T,F)
) %>%
group_by(user_language,screen_name) %>%
summarise(
wronglang = sum(wronglang),
`Wrong language` = sum(wronglang > 0)
) %>%
group_by(user_language,`Wrong language`) %>%
summarise(
`Count` = n()
) %>%
mutate(Count = round(100*Count/sum(Count))) %>%
arrange(-Count) %>%
mutate(`Count` = paste0(Count,"%")) %>%
ungroup() %>%
rename(
`UI language` = user_language,
`Tweeted in a different language` = Count
) %>% filter(`Wrong language` == 1) %>%
select(`UI language`, `Tweeted in a different language`) %>%
knitr::kable(
caption = "Proportion of twitter users that have tweeted in a language other than their website setting."
)
| UI language | Tweeted in a different language |
|---|---|
| Italian | 56% |
| Deutsch | 52% |
| French | 45% |
| Other | 44% |
| English | 37% |
# And what language do they tweet?
df_tweets_ch %>%
filter(
user_language != language
) %>%
group_by(user_language, language) %>%
summarise(
Count = n_distinct(screen_name)
) %>%
arrange(-Count) %>%
rename(
`Twitter setting` = user_language,
`Tweet language` = language
) %>%
mutate(`Count` = jb_percent(Count))
## Source: local data frame [25 x 3]
## Groups: Twitter setting [5]
##
## `Twitter setting` `Tweet language` Count
## <chr> <chr> <chr>
## 1 Deutsch English 414 (53%)
## 2 English Other 303 (32%)
## 3 French English 260 (46%)
## 4 English Deutsch 249 (27%)
## 5 English Unknown 215 (23%)
## 6 Deutsch Unknown 195 (25%)
## 7 French Unknown 160 (28%)
## 8 English French 141 (15%)
## 9 Other English 139 (42%)
## 10 Deutsch Other 129 (16%)
## # ... with 15 more rows
df_tweets_ch %>%
filter(
user_language != language
) %>%
group_by(user_language, lang) %>%
summarise(
Count = n_distinct(screen_name)
) %>%
arrange(-Count) %>%
rename(
`Twitter setting` = user_language,
`Tweet language` = lang
) %>%
mutate(`Count` = jb_percent(Count)) %>%
DT::datatable()
# swiss
df_tweets %>% filter(country_code == "CH") %>% nrow()
## [1] 19057
df_tweets %>%
group_by(country_code, lang) %>%
summarise(
n = n(),
n_users = n_distinct(user_id_str)
) %>%
mutate(
`% of users` = round(100*n_users/sum(n_users),1)
) %>%
arrange(country_code, -n_users) %>%
DT::datatable()
df_tweets %>%
filter(country_code == "CH") %>%
group_by(lang) %>%
summarise(
n = n(),
n_users = n_distinct(user_id_str)
) %>%
mutate(
`% of tweets` = round(100*n/sum(n),1),
`% of users` = round(100*n_users/sum(n_users),1)
) %>%
arrange(-n_users) %>%
DT::datatable()
library(sp)
d_ch <- raster::getData("GADM", country="CH", level=0)
p_ch <- sp::spTransform(d_ch, sp::CRS("+init=EPSG:32737"))
# convert data
df_plotdata <- df_tweets_ch %>%
select(
lon = place_lon,
lat = place_lat,
country_code,
screen_name,
Language = language
) %>%
na.omit
coordinates(df_plotdata) <- 1:2
proj4string(df_plotdata) <- sp::CRS("+proj=longlat +datum=WGS84")
df_plotdata <- sp::spTransform(df_plotdata, CRS("+init=EPSG:32737"))
df_plotdata <- as.data.frame(df_plotdata) %>%
filter(country_code == "CH" & Language %in%
c("Deutsch","English","French","Italian")) %>%
group_by(screen_name) %>%
slice(1)
# Blank theme
new_theme_empty <- theme_bw()
new_theme_empty$line <- element_blank()
new_theme_empty$rect <- element_blank()
new_theme_empty$strip.text <- element_blank()
new_theme_empty$axis.text <- element_blank()
new_theme_empty$plot.title <- element_blank()
new_theme_empty$axis.title <- element_blank()
new_theme_empty$plot.margin <- structure(c(0, 0, -1, -1), unit = "lines", valid.unit = 3L, class = "unit")
ggplot() +
#geom_polygon(data=p_ch, aes(long,lat,group=group), fill="whitesmoke") +
geom_path(
data=p_ch,
aes(long,lat, group=group),
color="grey",
size=0.1) +
geom_point(
data = df_plotdata,
aes(
x = lon, y = lat, colour = Language #, shape = Language
),
alpha = 0.5
) +
scale_colour_manual(
values = c(
Deutsch = "#ff0000",
English = "#808080",
French = "#0000ff",
Italian = "#009900"
)
) +
coord_equal() +
new_theme_empty
## Regions defined for each Polygons
jb_plot <- function(Language_filter = "English"){
ggplot() +
geom_polygon(
data=p_ch,
aes(x=long,y=lat,group=group),
fill="black") +
stat_bin2d(
data=df_plotdata %>%
filter(Language %in% c(Language_filter)),
aes(x=lon,y=lat),
bins=60) +
geom_path(data=p_ch, aes(x=long,y=lat,group=group), colour='white') +
#geom_polygon(data=outside, aes(x=long,y=lat), fill='white') +
coord_equal() +
new_theme_empty +
theme(
panel.background = element_rect(fill='black', colour='black'),
legend.position = "none"
) +
labs(title = paste(Language_filter,"tweet distribution")) +
annotate("text", x = -1990000, y=15770000, label = Language_filter, colour = "white", size = 8)
}
jb_plot("English")
## Regions defined for each Polygons
## Regions defined for each Polygons
jb_plot("French")
## Regions defined for each Polygons
## Regions defined for each Polygons
jb_plot("Deutsch")
## Regions defined for each Polygons
## Regions defined for each Polygons
jb_plot("Italian")
## Regions defined for each Polygons
## Regions defined for each Polygons
TAKE AREA, and ratio of each language!
theme_map <- function(...) {
theme_minimal() +
theme(
axis.line = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
# panel.grid.minor = element_line(color = "#ebebe5", size = 0.2),
panel.grid.major = element_line(color = "#ebebe5", size = 0.2),
panel.grid.minor = element_blank(),
plot.background = element_rect(fill = "#f5f5f2", color = NA),
panel.background = element_rect(fill = "#f5f5f2", color = NA),
legend.background = element_rect(fill = "#f5f5f2", color = NA),
panel.border = element_blank(),
...
)
}
library(rgdal)
## rgdal: version: 1.2-7, (SVN revision 660)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 2.1.2, released 2016/10/24
## Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/3.3/Resources/library/rgdal/gdal
## Loaded PROJ.4 runtime: Rel. 4.9.1, 04 March 2015, [PJ_VERSION: 491]
## Path to PROJ.4 shared files: /Library/Frameworks/R.framework/Versions/3.3/Resources/library/rgdal/proj
## Linking to sp version: 1.2-4
library(raster)
##
## Attaching package: 'raster'
## The following object is masked from 'package:dplyr':
##
## select
library(cartography)
library(dplyr)
library(viridis)
## Loading required package: viridisLite
df_tweets_ch_canton <- df_tweets_ch %>% ungroup
coordinates(df_tweets_ch_canton) <- ~place_lon+place_lat
proj4string(df_tweets_ch_canton) <- sp::CRS("+proj=longlat +datum=WGS84")
swiss_shape <- readRDS("CHE_adm1.rds")
df_tweets_ch_canton <- spTransform(df_tweets_ch_canton, proj4string(swiss_shape))
# this does the lon/lat to zip mapping
df_tweets_ch_canton <- df_tweets_ch_canton %over% swiss_shape
df_tweets_ch_mapped <- cbind(df_tweets_ch,df_tweets_ch_canton)
df_tweets_ch_mapped <- df_tweets_ch_mapped %>%
group_by(NAME_1,language) %>%
summarise(
n = n()
) %>%
group_by(NAME_1) %>%
mutate(
Proportion = 100*n/sum(n)
)
# read in background relief
relief <- raster("02-relief-georef-clipped-resampled.tif")
relief_spdf <- as(relief, "SpatialPixelsDataFrame")
# relief is converted to a very simple data frame,
# just as the fortified municipalities.
# for that we need to convert it to a
# SpatialPixelsDataFrame first, and then extract its contents
# using as.data.frame
relief <- as.data.frame(relief_spdf) %>%
rename(value = `X02.relief.georef.clipped.resampled`)
# remove unnecessary variables
rm(relief_spdf)
map_data <- df_tweets_ch_mapped %>%
ungroup
# fortify, i.e., make ggplot2-compatible
map_data_fortified <- fortify(swiss_shape, region = "NAME_1")
# now we join the thematic data
map_data <- map_data_fortified %>%
left_join(map_data, by = c("id" = "NAME_1")) %>%
na.omit()
plot_canton <- function(the_language = "Deutsch"){
temp_map_data <- map_data %>% filter(language == the_language) %>%
# drop if less than 10
filter(n >= 10)
# coordinates for data
coordinates(temp_map_data) = ~long + lat
proj4string(temp_map_data) <- sp::CRS("+proj=longlat +datum=WGS84")
temp_map_data <- as.data.frame(sp::spTransform(temp_map_data, CRS("+init=EPSG:21781")))
# coordinates for relief
relief_proj <- relief
coordinates(relief_proj) = ~x + y
proj4string(relief_proj) <- sp::CRS("+init=epsg:21781")
relief_proj <- as.data.frame(relief_proj)
#relief_proj <- as.data.frame(sp::spTransform(relief_proj, CRS("+init=EPSG:32737")))
# labels
# same code as above but different breaks
pretty_breaks <- c(10,20,30,40,50,60)
# find the extremes
minVal <- min(temp_map_data$Proportion, na.rm = T)
maxVal <- max(temp_map_data$Proportion, na.rm = T)
# compute labels
labels <- c()
brks <- c(0, pretty_breaks, 70)
# round the labels (actually, only the extremes)
for(idx in 1:length(brks)){
labels <- c(labels,round(brks[idx + 1], 2))
}
labels <- labels[1:length(labels)-1]
# define a new variable on the data set just as above
temp_map_data$brks <- cut(temp_map_data$Proportion,
breaks = brks,
include.lowest = TRUE,
labels = labels)
brks_scale <- levels(temp_map_data$brks)
labels_scale <- rev(brks_scale)
# plot
ggplot() +
geom_raster(
data = relief_proj,
aes(x = x,
y = y,
alpha = value)) +
scale_alpha(name = "", range = c(0.6, 0), guide = F) +
geom_polygon(data = temp_map_data,
aes(fill = brks,
x = long,
y = lat,
group = group),
alpha = 0.9) +
# municipality outline
geom_path(data = temp_map_data,
aes(x = long,
y = lat,
group = group),
color = "white", size = 0.1) +
coord_equal() +
theme_map() +
theme(
legend.position = c(0.5, 0.03),
legend.text.align = 0,
legend.background = element_rect(fill = alpha('white', 0.0)),
legend.text = element_text(size = 7, hjust = 0, color = "#4e4d47"),
plot.title = element_text(hjust = 0.5, color = "#4e4d47"),
plot.subtitle = element_text(hjust = 0.5, color = "#4e4d47",
margin = margin(b = -0.1,
t = -0.1,
l = 2,
unit = "cm"),
debug = F),
legend.title = element_text(size = 8),
plot.margin = unit(c(.5,.5,.2,.5), "cm"),
panel.spacing = unit(c(-.1,0.2,.2,0.2), "cm"),
panel.border = element_blank(),
plot.caption = element_text(size = 6,
hjust = 0.92,
margin = margin(t = 0.2,
b = 0,
unit = "cm"),
color = "#939184")
) +
labs(x = NULL,
y = NULL,
title = paste(the_language,"tweeters (%)")
) +
scale_fill_manual(
values = rev(magma(8, alpha = 0.8)[1:7]),
breaks = rev(brks_scale),
name = paste(""),
drop = FALSE,
labels = labels_scale,
guide = guide_legend(
direction = "horizontal",
keyheight = unit(2, units = "mm"),
keywidth = unit(70/length(labels), units = "mm"),
title.position = 'top',
title.hjust = 0.5,
label.hjust = 1,
nrow = 1,
byrow = T,
reverse = T,
label.position = "bottom"
)
)
}
plot_canton("English")
plot_canton("French")
plot_canton("Deutsch")
plot_canton("Italian")
#######
v_latlon_steps <- df_plotdata %>% ungroup %>%
summarise(
lat_min = min(lat),
lat_max = max(lat),
lon_min = min(lon),
lon_max = max(lon)
) %>%
mutate(
lat = (lat_max - lat_min)/25,
lon = (lon_max - lon_min)/25
)
for(i in 1:nrow(df_plotdata)){
}
temp_i <- df_plotdata %>%
filter(Language == "English")
temp_i <- temp_i[1,]
temp_ii <- df_plotdata %>%
filter(
lon >= (temp_i$lon - v_latlon_steps$lon) &
lon <= (temp_i$lon + v_latlon_steps$lon) &
lat >= (temp_i$lat - v_latlon_steps$lat) &
lat <= (temp_i$lat + v_latlon_steps$lat)
) %>% ungroup %>%
summarise(
n = n(),
lang_n = sum(Language == "English")
) %>%
mutate(percent = 100*lang_n/n)